library("dplyr")
# Read the raw CSV export into a data frame; the path is relative to the
# current working directory.
df_va_original <- read.csv(
  file = "./IHME_PHMRC_VA_DATA_CHILD_Y2013M09D11_0.csv"
)

Some columns are dropped because they are irrelevant, duplicated, or uninformative (for example, the word-frequency columns provide little information without context).

# Drop columns that are not necessary.
# The column names are listed explicitly and wrapped at commas only, so that
# no identifier is ever split across two lines (a split name such as
# `wo` / `rd_kept` makes the call unparseable).
df_va <- subset(
  df_va_original,
  select = -c(
    # gs_* / va* columns
    gs_code34, va34, gs_code46, gs_text46, va46, gs_code55, gs_text55, va55,
    gs_comorbid1, gs_comorbid2, gs_level,
    # g1_* .. g5_* columns
    g1_01d, g1_01m, g1_01y, g1_05, g1_06d, g1_06m, g1_06y,
    g1_07a, g1_07b, g1_07c, g1_08, g1_09, g1_10,
    g2_01, g2_02,
    g2_03ad, g2_03am, g2_03ay, g2_03bd, g2_03bm, g2_03by,
    g2_03cd, g2_03cm, g2_03cy, g2_03dd, g2_03dm, g2_03dy,
    g2_03ed, g2_03em, g2_03ey, g2_03fd, g2_03fm, g2_03fy,
    g3_01, g4_02, g4_03a, g4_03b, g4_04, g4_05,
    g5_05, g5_06a, g5_06b, g5_07, g5_08,
    # word_* columns
    word_diseas, word_final, word_child, word_condit, word_digest,
    word_glucos, word_bodi, word_tetanus, word_hand, word_failur,
    word_reduc, word_son, word_breath, word_look, word_till,
    word_spot, word_proper, word_medic, word_found, word_girl,
    word_ray, word_babi, word_privat, word_reason, word_poison,
    word_bring, word_renal, word_rash, word_healthi, word_interview,
    word_acquir, word_accid, word_test, word_cough, word_respiratori,
    word_mother, word_traffic, word_hospit, word_come, word_abdomen,
    word_abl, word_result, word_pregnanc, word_suffer, word_check,
    word_pass, word_famili, word_die, word_road, word_colleg,
    word_dengu, word_doctor, word_drown, word_tumor, word_cardio,
    word_eat, word_fall, word_examin, word_acidosi, word_fire,
    word_fit, word_sepsi, word_nilouf, word_happen, word_head,
    word_headach, word_skin, word_blood, word_home, word_hypertens,
    word_immedi, word_bad, word_inject, word_left, word_leg,
    word_daughter, word_malnutrit, word_milk, word_clock, word_anemia,
    word_nilof, word_complain, word_hour, word_nurs, word_snake,
    word_pain, word_pneumonia, word_polic, word_provid, word_boy,
    word_recov, word_malaria, word_asthma, word_explain, word_scan,
    word_gandhi, word_born, word_lung, word_stomach, word_difficulti,
    word_weak, word_client, word_time, word_told, word_transfus,
    word_treat, word_unconsci, word_water, word_loos, word_week,
    word_stool, word_ill, word_lot, word_jaundic, word_communiti,
    word_health, word_deliv, word_drink, word_servic, word_fine,
    word_eye, word_particip, word_money, word_chest, word_increas,
    word_live, word_expir, word_normal, word_brain, word_stay,
    word_urin, word_remov, word_admit, word_bite, word_center,
    word_measl, word_kept, word_especi, word_neck, word_serious,
    word_due, word_care, word_day, word_pox, word_hiv,
    word_icu, word_start, word_nose, word_leukemia, word_caus,
    word_near, word_morn, word_vomit, word_accord, word_gastric,
    word_receiv, word_coma, word_father, word_clinic, word_emerg,
    word_month, word_birth, word_treatment, word_sick, word_dehydr,
    word_prescrib, word_children, word_motion, word_refer, word_ward,
    word_certif, word_advis, word_hous, word_medicin, word_play,
    word_heart, word_diarrhea, word_baby, word_mouth, word_sever,
    word_shock, word_dead, word_oper, word_night, word_indraw,
    word_provinci, word_cancer, word_brought, word_even, word_convuls,
    word_addit, word_deceas, word_take, word_oxygen, word_infect,
    word_cold, word_misplac, word_swell, word_respond, word_transfer,
    word_thank, word_cri, word_sudden, word_continu, word_sent,
    word_stop, word_get, word_fever, word_notic, word_hole,
    word_kidney, word_bluish, word_yellow, word_injuri, word_pulmonari,
    # remaining columns
    c6_11, c6_12, c6_13, c6_14, newid, g5_02, module
  )
)
# Show the structure (column names, types, first values) of the result.
str(df_va)
## 'data.frame':    2064 obs. of  146 variables:
##  $ site     : chr  "AP" "Dar" "UP" "Pemba" ...
##  $ gs_text34: chr  "Bite of Venomous Animal" "Malaria" "Measles" "Pneumonia" ...
##  $ g4_06    : int  4 4 9 4 5 13 2 4 3 2 ...
##  $ g4_07    : int  2 3 3 2 7 1 2 2 2 1 ...
##  $ g4_08    : chr  "No" "Yes" "Yes" "Yes" ...
##  $ g5_01d   : chr  "20" "Don't Know" "6" "Don't Know" ...
##  $ g5_01m   : chr  "March" "Don't Know" "May" "Don't Know" ...
##  $ g5_01y   : chr  "2005" "2002" "2004" "Don't Know" ...
##  $ g5_03d   : chr  "10" "7" "4" "19" ...
##  $ g5_03m   : chr  "August" "April" "December" "March" ...
##  $ g5_03y   : int  2009 2009 2009 2009 2007 2009 2009 2009 2009 2009 ...
##  $ g5_04a   : int  4 7 5 1 6 NA NA NA NA 7 ...
##  $ g5_04b   : int  NA NA NA NA NA 5 10 11 11 NA ...
##  $ g5_04c   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ c1_01    : chr  "Multiple" "Singleton" "Singleton" "Singleton" ...
##  $ c1_02    : chr  "Second" "Don't Know" "Don't Know" "Don't Know" ...
##  $ c1_03    : chr  "" "Yes" "" "Yes" ...
##  $ c1_04    : chr  "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
##  $ c1_05    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ c1_06a   : chr  "Hospital" "Hospital" "Home" "Home" ...
##  $ c1_07    : chr  "About average" "About average" "About average" "Very small" ...
##  $ c1_08a   : chr  "" "Don't Know" "Grams" "Don't Know" ...
##  $ c1_08b   : num  3000 NA 3500 NA NA ...
##  $ c1_09    : chr  "Female" "Female" "Female" "Male" ...
##  $ c1_10    : chr  "" "Specified" "Specified" "Don't Know" ...
##  $ c1_10d   : chr  "20" "Don't Know" "6" "" ...
##  $ c1_10m   : chr  "March" "Don't Know" "May" "" ...
##  $ c1_10y   : chr  "2005" "2002" "2004" "" ...
##  $ c1_11    : chr  "Alive" "Alive" "Alive" "Alive" ...
##  $ c1_12    : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ c1_13    : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ c1_14    : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ c1_15    : chr  "No" "No" "No" "No" ...
##  $ c1_16    : chr  "No" "No" "No" "No" ...
##  $ c1_17    : chr  "No" "No" "No" "No" ...
##  $ c1_18    : chr  "No" "No" "No" "No" ...
##  $ c1_19_1  : chr  "No" "No" "No" "No" ...
##  $ c1_19_2  : chr  "No" "No" "No" "No" ...
##  $ c1_19_3  : chr  "No" "No" "No" "No" ...
##  $ c1_19_4a : chr  "No" "No" "No" "No" ...
##  $ c1_19_4b : logi  NA NA NA NA NA NA ...
##  $ c1_19_5  : chr  "No" "No" "No" "No" ...
##  $ c1_19_6  : chr  "No" "No" "No" "No" ...
##  $ c1_20    : int  0 0 1825 0 0 150 300 300 270 0 ...
##  $ c1_21    : int  0 30 3 8 30 7 3 30 60 0 ...
##  $ c1_22a   : chr  "Hospital" "Hospital" "Home" "Hospital" ...
##  $ c1_24    : chr  "Specified" "Specified" "Specified" "Specified" ...
##  $ c1_24d   : chr  "10" "7" "4" "19" ...
##  $ c1_24m   : chr  "August" "April" "December" "March" ...
##  $ c1_24y   : chr  "2009" "2009" "2009" "2009" ...
##  $ c1_25    : int  0 0 1825 0 0 150 300 330 330 0 ...
##  $ c1_26    : chr  "28 days to 11 years" "28 days to 11 years" "28 days to 11 years" "28 days to 11 years" ...
##  $ c4_01    : chr  "No" "Yes" "Yes" "Yes" ...
##  $ c4_02    : int  0 30 3 4 12 7 2 20 20 0 ...
##  $ c4_03    : chr  "No" "Yes" "Yes" "Yes" ...
##  $ c4_04    : chr  "Don't Know" "Severe" "Severe" "Severe" ...
##  $ c4_05    : chr  "Don't Know" "Continuous" "Continuous" "Continuous" ...
##  $ c4_06    : chr  "No" "Yes" "Yes" "No" ...
##  $ c4_07a   : chr  "Don't Know" "Specified" "Specified" "Don't Know" ...
##  $ c4_07b   : int  0 4 4 0 0 3 3 10 4 0 ...
##  $ c4_08    : int  0 14 0 0 0 6 1 15 7 0 ...
##  $ c4_09    : chr  "No" "Yes" "No" "No" ...
##  $ c4_10    : int  0 0 0 0 0 5 0 15 7 0 ...
##  $ c4_11    : chr  "No" "No" "No" "No" ...
##  $ c4_12    : chr  "No" "No" "No" "No" ...
##  $ c4_13    : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ c4_14    : chr  "No" "No" "No" "No" ...
##  $ c4_15    : chr  "No" "No" "No" "No" ...
##  $ c4_16    : chr  "Yes" "No" "No" "Yes" ...
##  $ c4_17    : int  0 0 0 8 0 7 0 1 4 0 ...
##  $ c4_18    : chr  "Yes" "No" "No" "Yes" ...
##  $ c4_19    : int  0 0 0 3 0 0 0 0 1 0 ...
##  $ c4_20    : chr  "No" "No" "No" "Yes" ...
##  $ c4_22    : chr  "No" "No" "No" "No" ...
##  $ c4_23    : chr  "No" "No" "No" "No" ...
##  $ c4_24    : chr  "No" "No" "No" "Yes" ...
##  $ c4_25    : chr  "No" "No" "No" "No" ...
##  $ c4_26    : chr  "Yes" "No" "No" "No" ...
##  $ c4_27    : chr  "<6 hours" "Don't Know" "Don't Know" "Don't Know" ...
##  $ c4_28    : chr  "No" "No" "No" "Yes" ...
##  $ c4_29    : chr  "No" "No" "No" "Yes" ...
##  $ c4_30    : chr  "No" "No" "No" "No" ...
##  $ c4_31_1  : chr  "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
##  $ c4_31_2  : chr  "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
##  $ c4_32    : chr  "Don't Know" "Don't Know" "Don't Know" "Don't Know" ...
##  $ c4_33    : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ c4_34    : chr  "No" "No" "No" "No" ...
##  $ c4_35    : chr  "No" "No" "No" "Yes" ...
##  $ c4_36    : chr  "Yes" "No" "No" "No" ...
##  $ c4_37    : int  0 0 0 0 0 0 0 0 3 0 ...
##  $ c4_38    : chr  "No" "No" "No" "No" ...
##  $ c4_39    : chr  "No" "No" "No" "No" ...
##  $ c4_40    : chr  "Yes" "No" "No" "No" ...
##  $ c4_41    : chr  "No" "No" "No" "No" ...
##  $ c4_42    : chr  "No" "No" "No" "No" ...
##  $ c4_43    : chr  "No" "No" "No" "No" ...
##  $ c4_44    : chr  "No" "No" "No" "No" ...
##  $ c4_45    : chr  "" "" "" "" ...
##  $ c4_46    : chr  "No" "No" "No" "No" ...
##   [list output truncated]

Transform data types: convert the character columns into categorical variables (factors).

# Convert every character column of df_va to a factor, leaving all other
# columns untouched.
# The column mask is computed once. vapply() is used instead of sapply()
# because it always returns a logical vector; sapply() returns an empty list
# for a zero-column data frame, which is not a valid subscript.
is_chr_col <- vapply(df_va, is.character, logical(1))
df_va[is_chr_col] <- lapply(df_va[is_chr_col], as.factor)

Compute the frequency distribution table for each categorical variable.

# Print a frequency table for every factor column of df_va.
# Each table is a two-column data frame: the first column carries the
# variable's own name and holds its levels, the second ("Freq") holds counts.
# seq_along() yields an empty sequence when df_va has no columns, whereas
# 1:ncol(df_va) would iterate over c(1, 0) and fail on the first index.
for (i in seq_along(df_va)) {
  column <- df_va[[i]]
  if (is.factor(column)) {
    # Named freq_table (not `table`) so the base function is not masked.
    freq_table <- data.frame(table(column))
    colnames(freq_table) <- c(names(df_va)[i], "Freq")
    print(freq_table)
  }
}
##     site Freq
## 1     AP  449
## 2  Bohol  262
## 3    Dar  467
## 4 Mexico  126
## 5  Pemba  261
## 6     UP  499
##                               gs_text34 Freq
## 1                                  AIDS   20
## 2               Bite of Venomous Animal   54
## 3                    Diarrhea/Dysentery  256
## 4                              Drowning   83
## 5                          Encephalitis   41
## 6                                 Falls   49
## 7                                 Fires   68
## 8                     Hemorrhagic fever   51
## 9                               Malaria  116
## 10                              Measles   23
## 11                           Meningitis   58
## 12                        Other Cancers   28
## 13        Other Cardiovascular Diseases   76
## 14 Other Defined Causes of Child Deaths  194
## 15             Other Digestive Diseases   48
## 16            Other Infectious Diseases   67
## 17                            Pneumonia  532
## 18                           Poisonings   18
## 19                         Road Traffic   92
## 20                               Sepsis  138
## 21                        Violent Death   52
##   g4_08 Freq
## 1          2
## 2    No  851
## 3   Yes 1211
##        g5_01d Freq
## 1              182
## 2           1   57
## 3          10   82
## 4          11   55
## 5          12   64
## 6          13   37
## 7          14   40
## 8          15   47
## 9          16   50
## 10         17   62
## 11         18   54
## 12         19   51
## 13          2   57
## 14         20   49
## 15         21   34
## 16         22   58
## 17         23   40
## 18         24   42
## 19         25   37
## 20         26   49
## 21         27   50
## 22         28   46
## 23         29   43
## 24          3   57
## 25         30   42
## 26         31   30
## 27          4   58
## 28          5   57
## 29          6   47
## 30          7   53
## 31          8   59
## 32          9   61
## 33 Don't Know  314
##        g5_01m Freq
## 1              182
## 2    November   89
## 3       April  156
## 4      August  161
## 5    December  114
## 6  Don't Know  259
## 7    February  139
## 8     January  123
## 9        July  142
## 10       June  146
## 11      March  154
## 12        May  133
## 13    October  142
## 14  September  124
##        g5_01y Freq
## 1              182
## 2        1995    1
## 3        1996    3
## 4        1997   19
## 5        1998   71
## 6        1999   61
## 7        2000   59
## 8        2001   60
## 9        2002   60
## 10       2003   70
## 11       2004   99
## 12       2005   93
## 13       2006  124
## 14       2007  223
## 15       2008  430
## 16       2009  345
## 17       2010   14
## 18 Don't Know  150
##        g5_03d Freq
## 1                1
## 2           1   50
## 3          10   59
## 4          11   74
## 5          12   72
## 6          13   56
## 7          14   82
## 8          15   70
## 9          16   91
## 10         17   76
## 11         18   52
## 12         19   63
## 13          2   77
## 14         20   67
## 15         21   72
## 16         22   69
## 17         23   76
## 18         24   62
## 19         25   59
## 20         26   73
## 21         27   53
## 22         28   76
## 23         29   47
## 24          3   62
## 25         30   65
## 26         31   32
## 27          4   64
## 28          5   77
## 29          6   75
## 30          7   65
## 31          8   71
## 32          9   70
## 33 Don't Know    6
##        g5_03m Freq
## 1    November  197
## 2       April  170
## 3      August  209
## 4    December  176
## 5  Don't Know    2
## 6    February  148
## 7     January  143
## 8        July  159
## 9        June  127
## 10      March  190
## 11        May  153
## 12    October  199
## 13  September  191
##        c1_01 Freq
## 1               2
## 2 Don't Know    1
## 3   Multiple   76
## 4  Singleton 1985
##           c1_02 Freq
## 1    Don't Know 1990
## 2         First   30
## 3        Second   43
## 4 Third or More    1
##   c1_03 Freq
## 1        946
## 2    No   70
## 3   Yes 1048
##        c1_04 Freq
## 1      After   65
## 2 Don't Know 1997
## 3     During    2
##                        c1_06a Freq
## 1                  Don't Know   15
## 2                        Home  740
## 3                    Hospital 1213
## 4 On Route to Health Facility   18
## 5                       Other    7
## 6       Other Health Facility   71
##                c1_07 Freq
## 1                       4
## 2      About average 1661
## 3         Don't Know   42
## 4  larger than usual  117
## 5 smaller than usual  144
## 6         Very small   96
##              c1_08a Freq
## 1                    374
## 2        Don't Know  477
## 3             Grams 1207
## 4 Refused to Answer    6
##               c1_09 Freq
## 1                     16
## 2        Don't Know    3
## 3            Female 1216
## 4              Male  828
## 5 Refused to Answer    1
##               c1_10 Freq
## 1                    399
## 2        Don't Know  313
## 3 Refused to Answer    1
## 4         Specified 1351
##        c1_10d Freq
## 1              325
## 2           1   59
## 3          10   80
## 4          11   58
## 5          12   62
## 6          13   36
## 7          14   41
## 8          15   47
## 9          16   51
## 10         17   63
## 11         18   55
## 12         19   53
## 13          2   57
## 14         20   50
## 15         21   34
## 16         22   58
## 17         23   39
## 18         24   42
## 19         25   39
## 20         26   47
## 21         27   51
## 22         28   46
## 23         29   43
## 24          3   59
## 25         30   41
## 26         31   31
## 27          4   58
## 28          5   60
## 29          6   46
## 30          7   54
## 31          8   58
## 32          9   63
## 33 Don't Know  158
##        c1_10m Freq
## 1              325
## 2    November   94
## 3       April  159
## 4      August  161
## 5    December  114
## 6  Don't Know  104
## 7    February  138
## 8     January  127
## 9        July  141
## 10       June  147
## 11      March  156
## 12        May  132
## 13    October  139
## 14  September  127
##        c1_10y Freq
## 1              325
## 2        1995    1
## 3        1996    3
## 4        1997   19
## 5        1998   69
## 6        1999   59
## 7        2000   59
## 8        2001   60
## 9        2002   63
## 10       2003   70
## 11       2004   99
## 12       2005   87
## 13       2006  121
## 14       2007  215
## 15       2008  432
## 16       2009  362
## 17       2010   16
## 18 Don't Know    4
##   c1_11 Freq
## 1        125
## 2 Alive 1936
## 3  Dead    3
##        c1_12 Freq
## 1             125
## 2 Don't Know   18
## 3         No   22
## 4        Yes 1899
##               c1_13 Freq
## 1                    125
## 2        Don't Know   15
## 3                No    3
## 4 Refused to Answer    1
## 5               Yes 1920
##        c1_14 Freq
## 1             126
## 2 Don't Know    7
## 3         No    2
## 4        Yes 1929
##   c1_15 Freq
## 1          4
## 2    No 2060
##   c1_16 Freq
## 1    No 2064
##   c1_17 Freq
## 1    No 2064
##   c1_18 Freq
## 1    No 2064
##   c1_19_1 Freq
## 1      No 2064
##   c1_19_2 Freq
## 1      No 2064
##   c1_19_3 Freq
## 1      No 2064
##   c1_19_4a Freq
## 1       No 2064
##   c1_19_5 Freq
## 1      No 2064
##   c1_19_6 Freq
## 1      No 2064
##                        c1_22a Freq
## 1                  Don't Know    6
## 2                        Home  137
## 3                    Hospital 1708
## 4 On Route to Health Facility   65
## 5                       Other  141
## 6       Other Health Facility    7
##        c1_24 Freq
## 1 Don't Know    1
## 2  Specified 2063
##        c1_24d Freq
## 1           1   48
## 2          10   54
## 3          11   70
## 4          12   70
## 5          13   53
## 6          14   75
## 7          15   65
## 8          16   86
## 9          17   73
## 10         18   47
## 11         19   58
## 12          2   74
## 13         20   62
## 14         21   67
## 15         22   64
## 16         23   70
## 17         24   55
## 18         25   55
## 19         26   71
## 20         27   49
## 21         28   73
## 22         29   44
## 23          3   56
## 24         30   57
## 25         31   30
## 26          4   60
## 27          5   77
## 28          6   71
## 29          7   62
## 30          8   69
## 31          9   65
## 32 Don't Know  134
##        c1_24m Freq
## 1    November  189
## 2       April  156
## 3      August  201
## 4    December  161
## 5  Don't Know  129
## 6    February  144
## 7     January  135
## 8        July  153
## 9        June  114
## 10      March  183
## 11        May  131
## 12    October  188
## 13  September  180
##       c1_24y Freq
## 1       1999    1
## 2       2001    1
## 3       2005    1
## 4       2007   42
## 5       2008  371
## 6       2009 1340
## 7       2010  180
## 8 Don't Know  128
##                 c1_26 Freq
## 1 28 days to 11 years 2064
##        c4_01 Freq
## 1 Don't Know    2
## 2         No  770
## 3        Yes 1292
##        c4_03 Freq
## 1 Don't Know    9
## 2         No 1053
## 3        Yes 1002
##        c4_04 Freq
## 1 Don't Know 1067
## 2       Mild   50
## 3   Moderate  314
## 4     Severe  633
##           c4_05 Freq
## 1    Continuous  584
## 2    Don't Know 1068
## 3    On and Off  388
## 4 Only at Night   24
##        c4_06 Freq
## 1 Don't Know   15
## 2         No 1404
## 3        Yes  645
##       c4_07a Freq
## 1 Don't Know 1446
## 2  Specified  618
##        c4_09 Freq
## 1 Don't Know    3
## 2         No 1799
## 3        Yes  262
##               c4_11 Freq
## 1        Don't Know    3
## 2                No 1997
## 3 Refused to Answer    1
## 4               Yes   63
##               c4_12 Freq
## 1                No 1504
## 2 Refused to Answer    3
## 3               Yes  557
##   c4_14 Freq
## 1    No 1826
## 2   Yes  238
##        c4_15 Freq
## 1 Don't Know    2
## 2         No 1874
## 3        Yes  188
##        c4_16 Freq
## 1 Don't Know    9
## 2         No  708
## 3        Yes 1347
##               c4_18 Freq
## 1        Don't Know   35
## 2                No 1177
## 3 Refused to Answer    1
## 4               Yes  851
##        c4_20 Freq
## 1 Don't Know   25
## 2         No 1406
## 3        Yes  633
##        c4_22 Freq
## 1 Don't Know   27
## 2         No 1701
## 3        Yes  336
##        c4_23 Freq
## 1 Don't Know   28
## 2         No 1597
## 3        Yes  439
##               c4_24 Freq
## 1        Don't Know   25
## 2                No 1761
## 3 Refused to Answer    2
## 4               Yes  276
##        c4_25 Freq
## 1 Don't Know    5
## 2         No 1489
## 3        Yes  570
##        c4_26 Freq
## 1 Don't Know   25
## 2         No 1398
## 3        Yes  641
##              c4_27 Freq
## 1         <6 hours  291
## 2 24 hours or more  201
## 3       6-23 hours  140
## 4       Don't Know 1432
##        c4_28 Freq
## 1 Don't Know   18
## 2         No 1841
## 3        Yes  205
##        c4_29 Freq
## 1 Don't Know   26
## 2         No 1956
## 3        Yes   82
##        c4_30 Freq
## 1 Don't Know    3
## 2         No 1893
## 3        Yes  168
##       c4_31_1 Freq
## 1  Don't Know 1902
## 2  Everywhere   85
## 3 Extremities   26
## 4        Face   18
## 5       Other    1
## 6       Trunk   32
##       c4_31_2 Freq
## 1  Don't Know 2059
## 2 Extremities    4
## 3       Trunk    1
##         c4_32 Freq
## 1  Don't Know 1912
## 2  Everywhere   21
## 3 Extremities   41
## 4        Face   46
## 5       Other    1
## 6       Trunk   43
##        c4_34 Freq
## 1 Don't Know    2
## 2         No 2028
## 3        Yes   34
##        c4_35 Freq
## 1 Don't Know    4
## 2         No 1803
## 3        Yes  257
##        c4_36 Freq
## 1 Don't Know    2
## 2         No 1781
## 3        Yes  281
##               c4_38 Freq
## 1        Don't Know    6
## 2                No 1938
## 3 Refused to Answer    2
## 4               Yes  118
##        c4_39 Freq
## 1 Don't Know    2
## 2         No 1991
## 3        Yes   71
##               c4_40 Freq
## 1        Don't Know    4
## 2                No 1642
## 3 Refused to Answer    1
## 4               Yes  417
##               c4_41 Freq
## 1        Don't Know   22
## 2                No 1273
## 3 Refused to Answer    1
## 4               Yes  768
##        c4_42 Freq
## 1 Don't Know   20
## 2         No 2009
## 3        Yes   35
##        c4_43 Freq
## 1 Don't Know   41
## 2         No 1847
## 3        Yes  176
##        c4_44 Freq
## 1 Don't Know    3
## 2         No 1770
## 3        Yes  291
##                                                                                                      c4_45
## 1                                                                                                         
## 2                                                                                                        2
## 3                                                         Accident ke bad nak kan i mooh se blood baha tha
## 4                                                                                adha gala kata, cenay per
## 5                                                             after accident blooding from mouth and nose.
## 6                                                                                                 all body
## 7                                                                                       ANUS, NOSE & MOUTH
## 8                                                 AT THE SIDE KAY GIBUTANGAN UG HOSE PARA MAKUHA ANG TUBIG
## 9                                                                      Bache ke honth se raktsrav hua tha.
## 10                                                                                          BACK SIDE HEAD
## 11                                                                                BLED FROM MOUTH AND NOSE
## 12                                                                                         BLED FROM NOSE.
## 13                                                                                         BLEED FROM NOSE
## 14                                                                                       BLEED FROM URINE.
## 15                                                                                           BLEED IN HEAD
## 16                                                                                          BLEED IN MOUTH
## 17                                                                                BLEED IN MOUTH AND NOSE.
## 18                                                                                           BLEED IN NOSE
## 19                                                                                 BLEED IN NOSE AND MOUTH
## 20                                                                                          BLEED IN URINE
## 21                                                                                           BLEED VOMITNG
## 22                                                                                         BLEED VOMITNGS.
## 23                                                                                      bleeding from head
## 24                                                                  Bleeding from nose after pushing chest
## 25                                                                     Bleeding from where he peed (penis)
## 26                                                                           BLEEDING IN MOUTH AND IN HEAD
## 27                                                                                       Bleeding in potty
## 28                                                                blodding from head cuse of force on head
## 29                                                                          blood from head after accident
## 30                                                                                      BLOOD IN THE URINE
## 31                                                                                         Blood Vomitting
## 32                                                                                          BLOOD WOMTINGS
## 33                                          Blooding from head after accident cause of deep injury in head
## 34                                                                                      Blooding from nose
## 35                                                                                     Blooding with urine
## 36                                                                                                    BODY
## 37                                                                                           BY THE MOUTH.
## 38                            cause of attract with scissor blooding from stomach. Its blooding is enough.
## 39                                                                                   CHEST, LEGS AND HANDS
## 40                                                                                           CORD BLEEDING
## 41                                               Dabne ke karan jaangh par lohe ki keel se khoon nikla tha
## 42                                                                       Death se kuch ghante pahle nak se
## 43                                                                death se pahle muh se khoon aa gaya tha.
## 44                                                   Deevar girnay kay karar naak say ractstrava howa tha.
## 45                                     Deevar may dabnay kay karagh kaan va Naak say Ract estrava howa tha
## 46                                                                                                     EAR
## 47                                                                                              EAR, HEAD.
## 48                                                                                                    ears
## 49                                                                                       EARS, NOSE, MOUTH
## 50                                                                            EXCRETED BLOODY STOOL (ANUS)
## 51                                                                                             EYER, MOUTH
## 52                                                                                             FACE, CHEST
## 53                                                                                              face, Neck
## 54                                                                                               Face, Sar
## 55                                                                                           FACE, STUMACK
## 56                                                                                                FOREHEAD
## 57                                                                                                foreskin
## 58                                                                                  Form back side of head
## 59                                                                                     From ear and throat
## 60                                                                                               From Head
## 61                                                                                               FROM HEAD
## 62                                                                    From head (Sar ke bal banane ke bad)
## 63                                                                                      From Head and Anus
## 64                                                                                     From head and dhadh
## 65                                                                                      From head and hand
## 66                                                                                  From Head, nose, mouth
## 67                                                                                          FROM HER MOUTH
## 68                                                             FROM HER OROPHARENGEAL TUBE, BLOOD CAME OUT
## 69                                                                                     FROM HIS TUBE (NGT)
## 70                                                                                           from left arm
## 71                                                                                              from mouth
## 72                                                                                              From mouth
## 73                                                                                              From Mouth
## 74                                                                                              FROM MOUTH
## 75                                                                                     From mouth and nose
## 76                                                                                              FROM NAILS
## 77                                                                                               From Nake
## 78                                                                                               from nose
## 79                                                                                               from Nose
## 80                                                                                               From Nose
## 81                                                         From Nose (At the time of putting pipe in nose)
## 82                                                                                 From Nose , Ear & Mouth
## 83                                                                                     from nose and mouth
## 84                                                                          from Nose because of nose pipe
## 85                                                                                   From nose, ear, mouth
## 86                                                                                            From stomach
## 87                                                                                             from temple
## 88                                                                                          from the mouth
## 89  From the mouth because of a lack of ability of the blood to clot (because he/she had a probe/catheter)
## 90                                                                                                Gala say
## 91                                                                                     Galay say, pate say
## 92                                                                                                    GUMS
## 93                                                                                                    HAND
## 94                                                                                          HANDS AND LEGS
## 95                                                                                Hath par sap ke katne se
## 96                                                                                                    HEAD
## 97                                                                               HEAD - VERY LITTLE AMOUNT
## 98                                                                                   HEAD (FROM HIS WOUND)
## 99                                                                    Head ke pichle hisse se ,nak mooh se
## 100                                                                               HEAD RIGHT SIDE(KANITHA)
## 101                                                                                        HEAD, NOSE, EAR
## 102                                                                                            heart, head
## 103                                                                  IN HER MOUTHE WHEN SHE WAS SUCTIONED.
## 104                                       Kaan,naak Kanpati say chaku marnay kay karar Ractstrava howa tha
## 105                                                                                               KICHWANI
## 106                                                                                    KWENYE CHOO KIKUBWA
## 107                                                                                           KWENYE ULIMI
## 108                                                                      Left arm, nose, right foot, mouth
## 109                                                                                               LEFT EAR
## 110                                                                                              LEFT HAND
## 111                                                                                                    LEG
## 112                                                                                                   LIPS
## 113                                                                                               MalMutra
## 114                                                                                            masoodhe se
## 115                                                                                                MDOMONI
## 116                                                                                    MGUUNI  NA  MKONONI
## 117                                                                                        Mounth & Latrin
## 118                                                                                                  mouth
## 119                                                                                                  Mouth
## 120                                                                                                  MOUTH
## 121                                                                                         Mouth and Nose
## 122                                                                                         MOUTH AND NOSE
## 123                                                                                         MOUTH CUT NOSE
## 124                                                                                      MOUTH, ANUS, NOSE
## 125                                                                                            Mouth, Neck
## 126                                                            Mouth, Neck, ear , up the leg , Ghutno saya
## 127                                                                                            MOUTH, NOSE
## 128                                                                                      MOUTH, NOSE, EAR.
## 129                                                                                        MOUTH,NOCE, EAR
## 130                                                                                            MOUTH; NOSE
## 131                                                                                          naak say khun
## 132                                                                                                   NECK
## 133                                                                                       NOCE, MOUTH,ANUS
## 134                                                                                                   NOLE
## 135                                                                                                   nose
## 136                                                                                                   Nose
## 137                                                                                                   NOSE
## 138                                                                                    NOSE - GAMAY RA DAW
## 139                                                                                           NOSE & MOUTH
## 140                                                                                         Nose and mouth
## 141                                                                                         Nose and Mouth
## 142                                                                                         NOSE AND MOUTH
## 143                                                                                             NOSE MOUTH
## 144                                                                                    NOSE WHEN SUCTIONED
## 145                                                                                              Nose, ear
## 146                                                                                       NOSE, EAR, MOUTH
## 147                                                                                            NOSE, MOUTH
## 148                                                            NOSE, MOUTH FROM STAMAC BLOOD WAS BLEEDING.
## 149                                                                                           NOSE, MOUTH.
## 150                                                                                                on head
## 151                                                                                                On Head
## 152                                                  ONE LAST DAY URINATED BLOOD FROM  PENIS (BLOOD URINE)
## 153                                                                          only neck say Balgum kay Sath
## 154                                                                                  paav kay Anguthay say
## 155                                                                                               PAAV may
## 156                                                                                                  PUANI
## 157                                                                              PUANI MDOMONI KUHARA DAMU
## 158                                                                                         Pure sarir may
## 159                                                                              rectum which had a fisure
## 160                                                                                              RIGHT LEG
## 161                                  RIGHT UPPER EXTERMETY, BOTH LOWER EXTERMETY (DECEASED PHOTO ATTACHED)
## 162            Saans band hone par nak se raktshrav hua tha tab pamp dvara saans ke lene me sahayta ki gai
## 163                                                 Saap kay katanay kay baad naak say ractstrava howa tha
## 164                                                       Sanp katne ke bad nak se halka raktshrav hua tha
## 165                   Sanp ne dahine pair ki ungli me kata tha katne ke paschat ungli se raktshrav hua tha
## 166                                                                                           Sar , Jaghan
## 167                                                                                                sar say
## 168                                                                                 Sar say Sinay Pat say,
## 169                                                                                   SEHEMU YA HAJA KUBWA
## 170                                                                                                   Seir
## 171                                                         Seir may chote laganay say RactStrava howa tha
## 172                                                                            Seir say , Kamer kay nechey
## 173                                                        seirsay, sinaysay bahay say, jangh say pate per
## 174                                                         sharir kay vibhin Hisso say (wankay say Hatya)
## 175                                                                               SOMEWHERE IN THE STOMACH
## 176                                                                                                STOMACH
## 177                                                                        stomach where the operation was
## 178                                         Tanduya kay hamlay kay baad sar , galy say ractstrav huwa tha.
## 179                                                                                          Tatti ke sath
## 180                                                                                                TUMBONI
## 181                                                                                                   Ulti
## 182                                                                                                  URINE
## 183                                                                               URINE WITH BLOOD (FRESH)
## 184                                                                                                VAGINAL
## 185                                               VEGINAL BLEEDING ( BECAUSE OF HURT IN THE LOWER ABDOMEN.
## 186                                                 Visfoot kay uprant poray sarir say ractstrava howa tha
## 187                                                                               VOMITS OUT BLOOD (MOUTH)
## 188                                                                                       with Black Potty
##     Freq
## 1   1777
## 2      1
## 3      1
## 4      1
## 5      1
## 6      1
## 7      1
## 8      1
## 9      1
## 10     1
## 11     1
## 12     1
## 13     2
## 14     1
## 15     1
## 16     5
## 17     1
## 18     2
## 19     1
## 20     1
## 21     1
## 22     1
## 23     1
## 24     1
## 25     1
## 26     1
## 27     1
## 28     1
## 29     1
## 30     1
## 31     1
## 32     1
## 33     1
## 34     2
## 35     1
## 36     1
## 37     1
## 38     1
## 39     1
## 40     1
## 41     1
## 42     1
## 43     1
## 44     1
## 45     1
## 46     1
## 47     1
## 48     1
## 49     1
## 50     1
## 51     1
## 52     1
## 53     1
## 54     1
## 55     1
## 56     1
## 57     1
## 58     1
## 59     1
## 60     2
## 61     2
## 62     1
## 63     1
## 64     1
## 65     1
## 66     1
## 67     1
## 68     1
## 69     1
## 70     1
## 71     3
## 72     2
## 73     2
## 74     1
## 75     2
## 76     1
## 77     2
## 78     3
## 79     2
## 80     2
## 81     1
## 82     1
## 83     2
## 84     1
## 85     1
## 86     2
## 87     1
## 88     1
## 89     1
## 90     1
## 91     1
## 92     1
## 93     1
## 94     1
## 95     1
## 96    11
## 97     1
## 98     1
## 99     1
## 100    1
## 101    1
## 102    1
## 103    1
## 104    1
## 105    1
## 106    1
## 107    1
## 108    1
## 109    1
## 110    1
## 111    1
## 112    1
## 113    1
## 114    1
## 115    7
## 116    1
## 117    1
## 118    3
## 119    3
## 120   24
## 121    2
## 122    2
## 123    1
## 124    1
## 125    2
## 126    1
## 127    2
## 128    1
## 129    1
## 130    1
## 131    1
## 132    1
## 133    1
## 134    1
## 135    2
## 136    4
## 137   18
## 138    1
## 139    1
## 140    1
## 141    1
## 142    7
## 143    1
## 144    1
## 145    1
## 146    1
## 147    3
## 148    1
## 149    1
## 150    1
## 151    1
## 152    1
## 153    1
## 154    1
## 155    1
## 156    1
## 157    1
## 158    1
## 159    1
## 160    1
## 161    1
## 162    1
## 163    1
## 164    1
## 165    1
## 166    1
## 167    1
## 168    1
## 169    1
## 170    2
## 171    1
## 172    1
## 173    1
## 174    1
## 175    1
## 176    1
## 177    1
## 178    1
## 179    1
## 180    1
## 181    1
## 182    3
## 183    1
## 184    1
## 185    1
## 186    1
## 187    1
## 188    1
##               c4_46 Freq
## 1        Don't Know    4
## 2                No 1879
## 3 Refused to Answer    1
## 4               Yes  180
##   c4_47_1 Freq
## 1      No 1972
## 2     Yes   92
##   c4_47_2 Freq
## 1      No 1985
## 2     Yes   79
##   c4_47_3 Freq
## 1      No 1985
## 2     Yes   79
##   c4_47_4 Freq
## 1      No 2045
## 2     Yes   19
##   c4_47_5 Freq
## 1      No 2004
## 2     Yes   60
##   c4_47_6 Freq
## 1      No 2007
## 2     Yes   57
##   c4_47_7 Freq
## 1      No 2010
## 2     Yes   54
##   c4_47_8a Freq
## 1       No 1996
## 2      Yes   68
##                                c4_47_8b Freq
## 1                                       1990
## 2             ACCEDENTAL ELECTRIC SHOCK    1
## 3            ACCIDENTAL ELECTRICAL BURN    1
## 4            ACCIDENTAL ELECTRICS SHOCK    1
## 5                           BLOCK FALLS    1
## 6                            BOIL WATER    1
## 7          BOIL WATER FALL DOWN ON BODY    1
## 8                            BOMB BLAST    2
## 9                             Bum Blast    1
## 10                  BURN FROM HOT WATER    1
## 11                  BURNED BY HOT WATER    1
## 12          Chakkar Khakar Gir Gaya Tha    1
## 13                     Chapper Girne Se    1
## 14                         CURENT SHOCK    1
## 15                        CURRENT SHOCK    2
## 16            DAL FALL DOWN ON THE BODY    1
## 17                Dale Me Dab Gaya Tha.    1
## 18                 Death From Fall Wall    1
## 19      Deevar Gornay Say Dubkar Mrathu    1
## 20 Deevar Kay Nechay Dabnay Say Mratthu    1
## 21               Deewar Se Niche Dabkar    1
## 22         Divar Girne Par Usme Dabkar.    1
## 23                      Diwar Me Dabkar    1
## 24                          DOG'S BITE.    1
## 25                       ELECTIRC SHOCK    1
## 26                       ELECTRIC SHOCK    3
## 27                     ELECTRICAL SHOCK    2
## 28                      FALL DOWN STONE    1
## 29                       FALL DOWN WALL    1
## 30               FALLEN OF WALL ON BUDY    1
## 31                         Falling Wall    2
## 32                       FELL DOWN GATE    1
## 33         GATE FELL DOWN ON CHILD BODY    1
## 34              Ghar Girne Se Death Hui    1
## 35                               HAPANA    2
## 36                  HAPANA                 1
## 37           HEAD HIT BY A FALLING TREE    1
## 38                          HEAD INJURY    3
## 39                     HIT BY A COCONUT    1
## 40                            HOT WATER    1
## 41                      HOT WATER BURNS    1
## 42             HOT WATER FALLON ON BABY    1
## 43                      INJURY TO GROIN    1
## 44                  KAKANYANGA KICHOMVI    1
## 45                          Karant Laga    1
## 46                  KUANGUKIWA NA MNAZI    1
## 47                           LAND FALLS    1
## 48                           LEG INJURY    1
## 49                Mitti Se Niche Dabkar    3
## 50                        PRICK OF NAIL    1
## 51                ROAD TRAFFIC ACCIDENT    1
## 52                Self Shouting Mistake    1
## 53                SINKING IN THE TOILET    1
## 54                SPILLING OF HOT WATER    1
## 55                  SWALOD IRON TABLETS    1
## 56                            SWELLINGS    1
## 57                  T.V  FELLED ON FACE    1
## 58                T.V.FALL DOWN ON HEAD    1
## 59            T.V.FALL DOWN ON THE HEAD    1
## 60              TREE BRANCH FELL ON HIM    1
## 61                      Tv Fall On Head    1
## 62                           Under Wall    2
## 63                    UVIMBE WA SINDANO    1
##   c4_47_9 Freq
## 1      No 2040
## 2     Yes   24
##   c4_47_10 Freq
## 1       No 2035
## 2      Yes   29
##   c4_47_11 Freq
## 1       No 1511
## 2      Yes  553
##        c4_48 Freq
## 1 Don't Know   51
## 2         No 1938
## 3        Yes   75
##   c5_01 Freq
## 1    No  281
## 2   Yes 1783
##   c5_02_1 Freq
## 1      No 2009
## 2     Yes   55
##   c5_02_2 Freq
## 1      No 2001
## 2     Yes   63
##   c5_02_3 Freq
## 1      No 2055
## 2     Yes    9
##   c5_02_4 Freq
## 1      No  347
## 2     Yes 1717
##   c5_02_5 Freq
## 1      No 1791
## 2     Yes  273
##   c5_02_6 Freq
## 1      No 1864
## 2     Yes  200
##   c5_02_7 Freq
## 1      No 2032
## 2     Yes   32
##   c5_02_8 Freq
## 1      No 2061
## 2     Yes    3
##   c5_02_9 Freq
## 1      No 1582
## 2     Yes  482
##   c5_02_10 Freq
## 1       No 2043
## 2      Yes   21
##   c5_02_11a Freq
## 1        No 2046
## 2       Yes   18
##   c5_02_12 Freq
## 1       No 2063
## 2      Yes    1
##   c5_02_13 Freq
## 1       No 2064
##   c5_02_14 Freq
## 1       No 2064
##               c5_04 Freq
## 1                    282
## 2        Don't Know    4
## 3                No 1544
## 4 Refused to Answer    1
## 5               Yes  233
##   c5_05 Freq
## 1    No 1972
## 2   Yes   92
##      c5_06_1d Freq
## 1           1    2
## 2          10    4
## 3          11    4
## 4          12    2
## 5          13    3
## 6          14    4
## 7          15    3
## 8          16    2
## 9          17    1
## 10         18    2
## 11         19    4
## 12          2    3
## 13         20    4
## 14         21    1
## 15         22    2
## 16         23    3
## 17         24    3
## 18         25    1
## 19         26    5
## 20         27    4
## 21         28    3
## 22         29    2
## 23          3    2
## 24         31    1
## 25          4    4
## 26          5    2
## 27          6    3
## 28          7    3
## 29          8    7
## 30          9    4
## 31 Don't Know 1976
##      c5_06_1m Freq
## 1    November   11
## 2       April    4
## 3      August    7
## 4    December    4
## 5  Don't Know 1976
## 6    February   11
## 7     January    6
## 8        July   11
## 9        June    2
## 10      March    6
## 11        May    5
## 12    October    8
## 13  September   13
##     c5_06_1y Freq
## 1       2006    1
## 2       2007    5
## 3       2008   14
## 4       2009   58
## 5       2010   10
## 6 Don't Know 1976
##      c5_06_2d Freq
## 1          10    3
## 2          11    3
## 3          12    2
## 4          14    2
## 5          17    1
## 6          18    1
## 7          20    1
## 8          21    2
## 9          22    1
## 10         23    3
## 11         24    2
## 12         25    2
## 13         26    2
## 14         27    1
## 15         28    6
## 16          3    5
## 17         31    2
## 18          4    2
## 19          5    1
## 20          7    2
## 21          8    2
## 22 Don't Know 2018
##      c5_06_2m Freq
## 1    November    6
## 2       April    3
## 3      August    4
## 4    December    4
## 5  Don't Know 2018
## 6    February    4
## 7     January    5
## 8        July    2
## 9        June    3
## 10      March    8
## 11        May    1
## 12    October    4
## 13  September    2
##     c5_06_2y Freq
## 1       2007    1
## 2       2008    7
## 3       2009   29
## 4       2010    7
## 5 Don't Know 2020
##        c5_08d Freq
## 1           1    1
## 2          10    3
## 3          11    4
## 4          12    2
## 5          13    1
## 6          14    6
## 7          15    3
## 8          17    1
## 9          18    1
## 10         19    3
## 11          2    1
## 12         20    1
## 13         21    2
## 14         22    1
## 15         23    4
## 16         24    3
## 17         25    1
## 18         26    5
## 19         27    1
## 20         28    6
## 21          3    9
## 22         30    1
## 23         31    1
## 24          4    4
## 25          5    3
## 26          7    4
## 27          8    5
## 28          9    3
## 29 Don't Know 1984
##        c5_08m Freq
## 1    November    7
## 2       April    5
## 3      August    9
## 4    December    4
## 5  Don't Know 1984
## 6    February   11
## 7     January    5
## 8        July    7
## 9        June    5
## 10      March    8
## 11        May    2
## 12    October    8
## 13  September    9
##       c5_08y Freq
## 1       2007    3
## 2       2008   12
## 3       2009   55
## 4       2010   10
## 5 Don't Know 1984
##        c5_10 Freq
## 1             108
## 2 Don't Know   45
## 3         No  858
## 4        Yes 1053
##   c5_11 Freq
## 1    No 1758
## 2   Yes  306
##               c5_17 Freq
## 1                    104
## 2        Don't Know  184
## 3                No  942
## 4 Refused to Answer    4
## 5               Yes  830
##               c5_18 Freq
## 1        Don't Know   17
## 2                No 2020
## 3 Refused to Answer    2
## 4               Yes   25
##               c5_19 Freq
## 1                    236
## 2        Don't Know  140
## 3                No 1654
## 4 Refused to Answer    5
## 5               Yes   29

Columns that still need to be removed:
module: remove (same value for all observations)
c1_02: remove, as only applies to multiple births (info is covered by c1_01 already)
c1_04: remove, as it only applies to cases where the mother had died (and the info is covered in c1_03)
c1_05: remove, as it only applies to cases where the mother had died
c1_08a: remove, since it only shows units (refer to c1_08b for the detailed values)
c1_10,c1_10d,c1_10m,c1_10y: remove, duplicated with g5_01
c1_11: remove, since we only want to focus on children who were not dead at birth
c1_15: remove, since there’s only one class
c1_16: remove, since there’s only one class
c1_17: remove, since there’s only one class
c1_18: remove, since there’s only one class
c1_19_1 to c1_19_6: remove, since there’s only one class
c1_24: remove, since only showing units
c1_24d,c1_24m,c1_24y: remove, duplicated with g5_03
c1_26: remove, since there’s only one class
c4_07a: remove, since only showing units
c4_31_1: remove, since only applies to those with rash (information already there in c4_30)
c4_31_2: remove, as only applies to the ones who developed rash (info already covered in c4_30)
c4_32: remove, as only applies to the ones who developed rash (info already covered in c4_30)
c4_45: remove, since most of the value missing
c4_47_8b: remove, since most of the value missing
c5_02_13: remove, since there’s only one class
c5_02_14: remove, since there’s only one class
c5_06_2m,c5_06_2d,c5_06_2y: remove, most of the data is don’t know/missing
c5_08m,c5_08d,c5_08y: remove, most of the data is don’t know/missing

# Drop the columns listed above (redundant, single-class, unit-only or mostly
# missing). Every name must exist so that a typo fails loudly.
cols_to_drop <- c(
  "c1_02", "c1_04", "c1_05", "c1_08a",
  "c1_10", "c1_10d", "c1_10m", "c1_10y", "c1_11",
  "c1_15", "c1_16", "c1_17", "c1_18",
  "c1_19_1", "c1_19_2", "c1_19_3", "c1_19_4a", "c1_19_4b", "c1_19_5", "c1_19_6",
  "c1_24", "c1_24d", "c1_24m", "c1_24y", "c1_26",
  "c4_07a", "c4_31_1", "c4_31_2", "c4_32", "c4_45", "c4_47_8b",
  "c5_02_13", "c5_02_14",
  "c5_06_2m", "c5_06_2d", "c5_06_2y",
  "c5_08m", "c5_08d", "c5_08y"
)
stopifnot(all(cols_to_drop %in% names(df_va)))
df_va <- df_va[, !(names(df_va) %in% cols_to_drop), drop = FALSE]

Columns that need further processing:
g5_01d,g5_01m,g5_01y: compute DOB and then remove
g5_03d,g5_03m,g5_03y: compute DOD and then remove

# Date of birth: build "<year> - <month number> - <day>" from the three g5_01
# parts (month names are looked up in month.name) and parse it as UTC.
# Rows whose parts cannot be parsed end up as NA.
df_va$DOB <- as.POSIXct(
  paste(df_va$g5_01y, "-", match(df_va$g5_01m, month.name), "-", df_va$g5_01d),
  format = "%Y - %m - %d",
  tz = "UTC"
)
# Date of death: same construction from the three g5_03 parts.
df_va$DOD <- as.POSIXct(
  paste(df_va$g5_03y, "-", match(df_va$g5_03m, month.name), "-", df_va$g5_03d),
  format = "%Y - %m - %d",
  tz = "UTC"
)
# The raw day/month/year parts are no longer needed once DOB and DOD exist.
df_va[c("g5_01d", "g5_01m", "g5_01y", "g5_03d", "g5_03m", "g5_03y")] <- NULL

c1_22a: combine “hospital” with “other health facility” to “Health facility”

# Merge the two facility categories of c1_22a into a single "Health Facility"
# level; all other values (including NA) are left as they are.
place_of_death <- as.character(df_va$c1_22a)
place_of_death[place_of_death %in% c("Hospital", "Other Health Facility")] <- "Health Facility"
df_va$c1_22a <- as.factor(place_of_death)

c5_06_1d,c5_06_1m,c5_06_1y: remove, as most of the records are missing

# Remove the c5_06_1 day/month/year columns (mostly "Don't Know").
df_va[c("c5_06_1d", "c5_06_1m", "c5_06_1y")] <- NULL

Imputation and removal for missing values
Check missing values for numeric variables

# Keep only the numeric columns and print their five-number summaries,
# including the NA count of each column.
df_va_num <- df_va[vapply(df_va, is.numeric, logical(1))]
summary(df_va_num)
##      g4_06            g4_07            g5_04a           g5_04b      
##  Min.   : 1.000   Min.   : 0.000   Min.   : 1.000   Min.   : 1.000  
##  1st Qu.: 3.000   1st Qu.: 1.000   1st Qu.: 2.000   1st Qu.: 2.000  
##  Median : 5.000   Median : 2.000   Median : 4.000   Median : 5.000  
##  Mean   : 5.225   Mean   : 2.436   Mean   : 4.855   Mean   : 5.131  
##  3rd Qu.: 6.000   3rd Qu.: 3.000   3rd Qu.: 8.000   3rd Qu.: 8.000  
##  Max.   :30.000   Max.   :15.000   Max.   :19.000   Max.   :12.000  
##  NA's   :1        NA's   :1        NA's   :792      NA's   :1280    
##      g5_04c         c1_08b         c1_20            c1_21        
##  Min.   :28.0   Min.   :   2   Min.   :   0.0   Min.   :   0.00  
##  1st Qu.:28.0   1st Qu.:2500   1st Qu.:   0.0   1st Qu.:   1.00  
##  Median :29.0   Median :2800   Median :  90.0   Median :   7.00  
##  Mean   :28.6   Mean   :2798   Mean   : 564.4   Mean   :  37.15  
##  3rd Qu.:29.0   3rd Qu.:3100   3rd Qu.: 365.0   3rd Qu.:  20.00  
##  Max.   :29.0   Max.   :9999   Max.   :4015.0   Max.   :4015.00  
##  NA's   :2059   NA's   :485                                      
##      c1_25            c4_02             c4_07b           c4_08       
##  Min.   :   0.0   Min.   :  0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:   0.0   1st Qu.:  0.000   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median :  90.0   Median :  2.000   Median : 0.000   Median : 0.000  
##  Mean   : 559.2   Mean   :  5.587   Mean   : 1.839   Mean   : 1.701  
##  3rd Qu.: 365.0   3rd Qu.:  7.000   3rd Qu.: 3.000   3rd Qu.: 0.000  
##  Max.   :4015.0   Max.   :240.000   Max.   :30.000   Max.   :98.000  
##                                                                      
##      c4_10             c4_13             c4_17             c4_19        
##  Min.   : 0.0000   Min.   :  0.000   Min.   :  0.000   Min.   :  0.000  
##  1st Qu.: 0.0000   1st Qu.:  0.000   1st Qu.:  0.000   1st Qu.:  0.000  
##  Median : 0.0000   Median :  0.000   Median :  1.000   Median :  0.000  
##  Mean   : 0.3203   Mean   :  3.708   Mean   :  5.487   Mean   :  2.755  
##  3rd Qu.: 0.0000   3rd Qu.:  1.000   3rd Qu.:  4.000   3rd Qu.:  1.000  
##  Max.   :30.0000   Max.   :300.000   Max.   :740.000   Max.   :740.000  
##                                                                         
##      c4_33             c4_37             c4_49             c5_07_1       
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :  0.0000   Min.   :    0.0  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:  0.0000   1st Qu.:    0.0  
##  Median : 0.0000   Median : 0.0000   Median :  0.0000   Median :    0.0  
##  Mean   : 0.6192   Mean   : 0.9525   Mean   :  0.8075   Mean   :  257.1  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:  0.0000   3rd Qu.:    0.0  
##  Max.   :98.0000   Max.   :98.0000   Max.   :150.0000   Max.   :12000.0  
##                                                                          
##     c5_07_2       
##  Min.   :    0.0  
##  1st Qu.:    0.0  
##  Median :    0.0  
##  Mean   :  235.9  
##  3rd Qu.:    0.0  
##  Max.   :13200.0  
## 

Remove c5_07_1 and c5_07_2 since most of their values are 0 (which denotes missing in this case)

# Remove c5_07_1 and c5_07_2, whose values are almost all 0.
df_va <- df_va[, setdiff(names(df_va), c("c5_07_1", "c5_07_2")), drop = FALSE]

c1_08b: re-code “9999” as NA

# 9999 is a placeholder code in c1_08b, not a real value; turn it into NA.
# which() skips entries that are already NA.
df_va$c1_08b <- replace(df_va$c1_08b, which(df_va$c1_08b == 9999), NA)

Drop g5_04a,g5_04b,g5_04c and use DOB and DOD to compute the age of death (in years)

# The reported age components (g5_04a/b/c) are mostly missing, so drop them
# and derive the age at death from the two dates instead.
df_va <- subset(df_va, select = -c(g5_04a, g5_04b, g5_04c))
# Age at death in years: elapsed days between DOB and DOD divided by 365.
df_va$age_death <- as.numeric(difftime(df_va$DOD, df_va$DOB, units = "days")) / 365
library(naniar)
# Plot missingness of the numeric columns as they are NOW. The earlier
# df_va_num snapshot predates the column removals and the 9999 -> NA recoding
# of c1_08b, so plotting it would show dropped columns and under-report the
# missing values. df_va_num itself is left untouched.
vis_miss(df_va[vapply(df_va, is.numeric, logical(1))])

split training and test set

library(caTools)
# Fixed seed so that the 70/30 split is reproducible.
set.seed(123)
# sample.split() returns one logical per row, drawn from the site labels:
# TRUE marks the training rows, FALSE the test rows.
split <- sample.split(df_va$site, SplitRatio = 0.7)
training_set <- df_va[which(split), ]
test_set <- df_va[which(!split), ]

Deal with “Don’t Know”, “Refused to Answer” and originally missing values for all the categorical variables
1) If the combined count of the three is >=10: re-code all of them as “Don’t Know”, kept as a group of its own
2) If the combined count of the three is <10: re-code all of them as missing values

# Define function impute_cat
#' Harmonise non-informative answers in every factor column of a data frame.
#'
#' For each factor column the function counts "Don't Know",
#' "Refused to Answer", empty-string and NA entries.
#'   * If the combined count is at least 10, "Refused to Answer", empty
#'     strings and NA are all re-coded as "Don't Know", which is kept as a
#'     level of its own.
#'   * Otherwise "Don't Know", "Refused to Answer" and empty strings are all
#'     re-coded as NA.
#' Non-factor columns are returned unchanged.
#'
#' @param df_va A data frame.
#' @return `df_va` with its factor columns re-coded and re-levelled.
impute_cat <- function(df_va) {
  # Minimum number of non-informative answers for "Don't Know" to be kept as
  # a category. The previous test (== 10 or > 15) sent counts of 11-15 down
  # the NA branch, contradicting the documented ">= 10" rule.
  min_unknown <- 10

  # Positions of the categorical (factor) columns.
  cat_num <- which(vapply(df_va, is.factor, logical(1)))

  for (i in cat_num) {
    values <- as.character(df_va[[i]])

    is_missing <- is.na(values)
    is_dk <- !is_missing & values == "Don't Know"
    is_rta <- !is_missing & values == "Refused to Answer"
    is_whitespace <- !is_missing & values == ""

    count_all <- sum(is_dk) + sum(is_rta) + sum(is_missing) + sum(is_whitespace)

    if (count_all >= min_unknown) {
      # Enough cases: pool everything into the "Don't Know" group.
      values[is_rta | is_whitespace | is_missing] <- "Don't Know"
    } else {
      # Too few cases for a separate group: treat them all as missing.
      values[is_rta | is_dk | is_whitespace] <- NA
    }

    # Rebuild the factor so that unused levels disappear.
    df_va[[i]] <- as.factor(values)
  }

  df_va
}
# Re-code the categorical columns of the training set and of the test set.
# Each set is processed on its own, so the counts that drive the re-coding
# are taken per set.
# NOTE(review): because of that, the same column may be re-coded differently
# in the two sets - confirm this is intended.
imputed_sets <- lapply(list(training_set, test_set), impute_cat)
training_set <- imputed_sets[[1]]
test_set <- imputed_sets[[2]]

Impute missing values of c1_08b with mean of the data by different site groups (fit on training set and transform both sets)

# Mean of c1_08b per site in the training set, ignoring missing values.
with(training_set, tapply(c1_08b, site, mean, na.rm = TRUE))
##       AP    Bohol      Dar   Mexico    Pemba       UP 
## 2710.312 2971.918 2963.937 2554.524 3058.375 2715.539
#' Fill in a missing c1_08b value with a fixed per-site constant.
#'
#' @param x1 A single c1_08b value, possibly NA.
#' @param x2 The site label of the same record.
#' @return `x1` unchanged when it is not NA; otherwise the constant for the
#'   site (the rounded training-set site means printed above), with 2716 used
#'   for any site other than AP, Bohol, Dar, Mexico or Pemba.
correct_weight <- function(x1, x2) {
  # Present values pass straight through.
  if (!is.na(x1)) {
    return(x1)
  }

  site_means <- c(AP = 2710, Bohol = 2972, Dar = 2964, Mexico = 2555, Pemba = 3058)
  for (site_name in names(site_means)) {
    if (x2 == site_name) {
      return(site_means[[site_name]])
    }
  }

  # Fallback for every remaining site.
  2716
}
# Fill missing c1_08b values row by row with the per-site constant.
# mapply() walks the two columns in parallel, so each weight reaches
# correct_weight() as a number. The previous apply() call first turned the
# two-column data frame into a character matrix (site is not numeric), which
# left c1_08b as a character vector of formatted numbers.
# as.numeric() also keeps the result a numeric vector when a set has no rows.
training_set$c1_08b <- as.numeric(mapply(
  correct_weight,
  training_set$c1_08b,
  as.character(training_set$site),
  USE.NAMES = FALSE
))
test_set$c1_08b <- as.numeric(mapply(
  correct_weight,
  test_set$c1_08b,
  as.character(test_set$site),
  USE.NAMES = FALSE
))

Remove all the remaining missing data for both sets

# Keep only the rows without any NA, separately in each set.
training_set <- training_set[which(complete.cases(training_set)), ]
test_set <- test_set[which(complete.cases(test_set)), ]

<<<<<<< Updated upstream

Visualization for EDA

# Make a copy of the training set whose coded column names are replaced by
# descriptive ones, for use in the plots.
library(ggpubr)
library(plyr)
training_set_copy <- training_set

# Lookup table: original column code -> descriptive column name.
readable_names <- c(
  # Household
  "g4_06" = "Num_People_Live_at_Address",
  "g4_07" = "Num_Rooms_in_Household",
  "g4_08" = "Separate_Room_for_Cooking",
  # Birth
  "c1_09" = "Gender",
  "c1_01" = "Singleton_or_Multiple_Birth",
  "c1_03" = "Mother_Living_or_Deceased",
  "c1_06a" = "Location_of_Birth",
  "c1_07" = "Size_at_Birth",
  "c1_08b" = "Weight_at_Birth",
  "c1_12" = "Did_the_Baby_Cry",
  "c1_13" = "Did_the_Baby_Move",
  "c1_14" = "Did_the_Baby_Breathe",
  # Illness and death
  "c1_20" = "Age_at_Onset_of_Illness",
  "c1_21" = "Duration_of_Illness",
  "c1_22a" = "Location_of_Death",
  "c1_25" = "Age_at_Time_of_Death",
  # Fever
  "c4_01" = "Fever_During_Illness",
  "c4_02" = "Duration_of_Fever_in_Days",
  "c4_03" = "Did_the_Fever_Continue_to_Death",
  "c4_04" = "Severity_of_Fever",
  "c4_05" = "Fever_Pattern",
  # Stool
  "c4_06" = "Loose_Liquid_Stool",
  "c4_07b" = "Highest_Num_Loose_Stool_per_Day_During_Illness",
  "c4_08" = "Num_Days_Before_Death_Loose_Stool_Began",
  "c4_09" = "Loose_Stool_Cont_Until_Death",
  "c4_10" = "Num_Days_Before_Death_Loose_Stool_Stopped",
  "c4_11" = "Blood_in_Stool",
  # Cough and breathing
  "c4_12" = "Cough_During_Illness",
  "c4_13" = "Duration_of_Cough",
  "c4_14" = "Severity_of_Cough",
  "c4_15" = "Vomitus_after_Coughing",
  "c4_16" = "Difficulty_Breathing",
  "c4_17" = "Duration_of_Difficulty_Breathing",
  "c4_18" = "Fast_Breathing",
  "c4_19" = "Duration_of_Fast_Breathing",
  "c4_20" = "Indrawing_of_Chest",
  "c4_22" = "Breathing_Stridor",
  "c4_23" = "Breathing_Grunting",
  "c4_24" = "Breathing_Wheezing",
  # Other symptoms
  "c4_25" = "Convulsions",
  "c4_26" = "Loss_of_Consciousness",
  "c4_27" = "Duration_Before_Death_LOC_Occurred",
  "c4_28" = "Stiff_Neck",
  "c4_29" = "Bulging_Fontanelle",
  "c4_30" = "Skin_Rash",
  "c4_33" = "Duration_of_Rash",
  "c4_34" = "Blisters_Present_in_Rash",
  "c4_35" = "Limbs_Become_Thin",
  "c4_36" = "Swollen_Legs_or_Feet",
  "c4_37" = "Duration_of_Swelling",
  "c4_38" = "Skin_Flake_Off_in_Patches",
  "c4_39" = "Hair_Color_Change_to_Red_Yellow",
  "c4_40" = "Protruding_Belly",
  "c4_41" = "Pallor_or_Lack_of_Blood",
  "c4_42" = "Swelling_in_Armpits",
  "c4_43" = "Whitish_Rash_in_Mouth",
  "c4_44" = "Bleeding_Seen",
  "c4_46" = "Skin_Turned_Black",
  # Injuries
  "c4_47_1" = "Suffered_Road_Traffic_Injury",
  "c4_47_2" = "Suffered_a_Fall",
  "c4_47_3" = "Suffered_Drowning",
  "c4_47_4" = "Suffered_Poisoning",
  "c4_47_5" = "Suffered_Bite_Sting",
  "c4_47_6" = "Suffered_Burn_Fire",
  "c4_47_7" = "Victim_of_Violence",
  "c4_47_8a" = "Other_Injury",
  "c4_47_9" = "Unsure_if_Injury_Occurred",
  "c4_47_10" = "Refused_to_Answer_if_Deceased_Suffered_Injury",
  "c4_47_11" = "Did_Not_Suffer_Injury",
  "c4_48" = "Injury_Intentionally_Inflicted_by_Someone",
  "c4_49" = "Days_Survived_After_Injury",
  # Care seeking
  "c5_01" = "Sought_Care_While_Ill",
  "c5_02_1" = "Care_Sought_Traditional_Healer",
  "c5_02_2" = "Care_Sought_Homeopath",
  "c5_02_3" = "Care_Sought_Religious_Leader",
  "c5_02_4" = "Care_Sought_Governmental_Hospital",
  "c5_02_5" = "Care_Sought_Governmental_Health_Center_Clinic",
  "c5_02_6" = "Care_Sought_Private_Hospital",
  "c5_02_7" = "Care_Sought_Community_Based_Practioner",
  "c5_02_8" = "Care_Sought_Trained_Birth_Attendant",
  "c5_02_9" = "Care_Sought_Private_Physician",
  "c5_02_10" = "Care_Sought_Pharmacy",
  "c5_02_11a" = "Care_Sought_Other_Provider",
  "c5_02_12" = "Care_Sought_Relative_Friend",
  # Records and maternal HIV status
  "c5_04" = "Health_Records_for_Deceased",
  "c5_05" = "Granted_Access_to_Health_Records",
  "c5_10" = "Death_Certificate_Issued",
  "c5_11" = "Granted_Access_to_Death_Certificate",
  "c5_17" = "Mother_Ever_Tested_for_HIV",
  "c5_18" = "Mother_HIV_Positive",
  "c5_19" = "Mother_AIDS_Positive"
)

# Columns whose name is not in the table keep their current name.
colnames(training_set_copy) <- mapvalues(
  colnames(training_set_copy),
  from = names(readable_names),
  to = unname(readable_names)
)
library(ggplot2)

Plot distribution of all the numerical variables with histogram

# One histogram per numeric column of training_set_copy, in column order,
# then arranged two plots per row.
numeric_cols <- unname(which(vapply(training_set_copy, is.numeric, logical(1))))
hist_list <- lapply(numeric_cols, function(col_idx) {
  col_name <- names(training_set_copy)[col_idx]
  ggplot(data = training_set_copy, aes_string(col_name)) +
    geom_histogram(fill = 'lightblue', color = 'black') +
    labs(x = col_name, y = "Frequency") +
    theme(
      axis.text = element_text(size = 12),
      axis.title = element_text(size = 8)
    )
})
ggarrange(plotlist = hist_list, ncol = 2)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## attr(,"class")
## [1] "list"      "ggarrange"

Plot bar charts for all the categorical variables

library(scales)
# Build one bar chart per factor column of training_set_copy. Each chart shows
# the number of rows in every level of that column, with a text label giving
# the count and its percentage of all rows.
bar_list <- list()
# z counts the factor columns found so far and is the index into bar_list.
z <- 0
for (i in 1:ncol(training_set_copy)){
  if(is.factor(training_set_copy[,i])){
    z <- z+1
    # Per-level summary of column i: `number` is the row count of the level,
    # `percentage` is that count formatted as a share of all rows.
    # `pos` is computed as well but is not referenced by the plotting code
    # in this block.
    # NOTE(review): the result is stored under the name `class`, which is also
    # the name of base R's class() function.
    # NOTE(review): plyr is attached after dplyr earlier in this file, so the
    # unqualified mutate() calls presumably resolve to plyr::mutate - confirm.
    class <- training_set_copy %>% 
     group_by((training_set_copy)[i]) %>%
     dplyr::summarise(number=n()) %>%
     mutate(percentage=percent(number/sum(number))) %>% 
     mutate(pos = cumsum(number)- number/1.5) #Create a variable to indicate the position of labels 
    # x is the first column of the summary (the grouping column) and y is the
    # second (`number`); the axis label uses the column's name in the data.
    bar <- ggplot(data=class,aes_string(x=colnames(class)[1],y=colnames(class)[2]))+
      geom_bar(fill='lightblue',color='black',stat = "identity")+
      labs(x=colnames(training_set_copy)[i],y= "Count")+
      geom_text(aes(label=paste(number,"(",percentage,")")),vjust=-0.5,size=2,color="red")+
      theme(axis.text=element_text(size=8,angle=45,hjust=1),axis.title=element_text(size=8))
    bar_list[[z]] <- bar
  }
}
# The cause of death has many levels, so it is plotted on its own.
# This takes the chart of the second factor column encountered by the loop.
bar_list[[2]]

# All remaining charts, arranged two per row.
ggarrange(plotlist=bar_list[-2],ncol=2)
## $`1`

## 
## $`2`

## 
## $`3`

## 
## $`4`

## 
## $`5`

## 
## $`6`

## 
## $`7`

## 
## $`8`

## 
## $`9`

## 
## $`10`

## 
## $`11`

## 
## $`12`

## 
## $`13`

## 
## $`14`

## 
## $`15`

## 
## $`16`

## 
## $`17`

## 
## $`18`

## 
## $`19`

## 
## $`20`

## 
## $`21`

## 
## $`22`

## 
## $`23`

## 
## $`24`

## 
## $`25`

## 
## $`26`

## 
## $`27`

## 
## $`28`

## 
## $`29`

## 
## $`30`

## 
## $`31`

## 
## $`32`

## 
## $`33`

## 
## $`34`

## 
## $`35`

## 
## $`36`

## 
## $`37`

## 
## $`38`

## 
## attr(,"class")
## [1] "list"      "ggarrange"
# The five most frequent causes of death in the whole dataset:
# Pneumonia, Diarrhea/Dysentery, Other Defined Causes of Child Deaths,
# Sepsis, Malaria.
top_5 <- c(
  "Pneumonia",
  "Diarrhea/Dysentery",
  "Other Defined Causes of Child Deaths",
  "Sepsis",
  "Malaria"
)
# Keep only the rows whose cause of death is one of those five.
training_set_copy_top5 <- training_set_copy[
  is.element(training_set_copy$gs_text34, top_5),
]
# Rebuild every factor from its labels so that levels which no longer occur
# in the subset are dropped.
factor_cols <- which(vapply(training_set_copy_top5, is.factor, logical(1)))
for (col_idx in factor_cols) {
  training_set_copy_top5[[col_idx]] <-
    as.factor(as.character(training_set_copy_top5[[col_idx]]))
}
table(training_set_copy_top5$gs_text34)
## 
##                   Diarrhea/Dysentery                              Malaria 
##                                  108                                   45 
## Other Defined Causes of Child Deaths                            Pneumonia 
##                                   78                                  221 
##                               Sepsis 
##                                   64

We may also want to see the top 5 causes of death at each site

# Five most frequent causes of death at each site.
# Rows are counted per (site, cause), sorted by site and descending count,
# and the first five rows of every site are kept.
# The rank test is `row_number() <= 5`; comparing against the vector 1:5
# depends on vector recycling and does not work for a site that has fewer
# than five distinct causes.
top_5_by_site <- training_set_copy[, c("site", "gs_text34")] %>%
  group_by(site, gs_text34) %>%
  dplyr::summarise(count_cause = n()) %>%
  arrange(site, desc(count_cause)) %>%
  dplyr::filter(dplyr::row_number() <= 5)
top_5_by_site
## # A tibble: 30 x 3
## # Groups:   site [6]
##    site  gs_text34                            count_cause
##    <fct> <fct>                                      <int>
##  1 AP    Pneumonia                                     36
##  2 AP    Other Defined Causes of Child Deaths          15
##  3 AP    Diarrhea/Dysentery                            11
##  4 AP    Sepsis                                        10
##  5 AP    Drowning                                       9
##  6 Bohol Pneumonia                                     72
##  7 Bohol Other Digestive Diseases                      10
##  8 Bohol Diarrhea/Dysentery                             9
##  9 Bohol Sepsis                                         9
## 10 Bohol Hemorrhagic fever                              7
## # ... with 20 more rows
# Column names of the top-5 subset, shown for reference.
var_list <- colnames(training_set_copy_top5)
print(var_list)
##  [1] "site"                                          
##  [2] "gs_text34"                                     
##  [3] "Num_People_Live_at_Address"                    
##  [4] "Num_Rooms_in_Household"                        
##  [5] "Separate_Room_for_Cooking"                     
##  [6] "Singleton_or_Multiple_Birth"                   
##  [7] "Mother_Living_or_Deceased"                     
##  [8] "Location_of_Birth"                             
##  [9] "Size_at_Birth"                                 
## [10] "Weight_at_Birth"                               
## [11] "Gender"                                        
## [12] "Did_the_Baby_Cry"                              
## [13] "Did_the_Baby_Move"                             
## [14] "Did_the_Baby_Breathe"                          
## [15] "Age_at_Onset_of_Illness"                       
## [16] "Duration_of_Illness"                           
## [17] "Location_of_Death"                             
## [18] "Age_at_Time_of_Death"                          
## [19] "Fever_During_Illness"                          
## [20] "Duration_of_Fever_in_Days"                     
## [21] "Did_the_Fever_Continue_to_Death"               
## [22] "Severity_of_Fever"                             
## [23] "Fever_Pattern"                                 
## [24] "Loose_Liquid_Stool"                            
## [25] "Highest_Num_Loose_Stool_per_Day_During_Illness"
## [26] "Num_Days_Before_Death_Loose_Stool_Began"       
## [27] "Loose_Stool_Cont_Until_Death"                  
## [28] "Num_Days_Before_Death_Loose_Stool_Stopped"     
## [29] "Blood_in_Stool"                                
## [30] "Cough_During_Illness"                          
## [31] "Duration_of_Cough"                             
## [32] "Severity_of_Cough"                             
## [33] "Vomitus_after_Coughing"                        
## [34] "Difficulty_Breathing"                          
## [35] "Duration_of_Difficulty_Breathing"              
## [36] "Fast_Breathing"                                
## [37] "Duration_of_Fast_Breathing"                    
## [38] "Indrawing_of_Chest"                            
## [39] "Breathing_Stridor"                             
## [40] "Breathing_Grunting"                            
## [41] "Breathing_Wheezing"                            
## [42] "Convulsions"                                   
## [43] "Loss_of_Consciousness"                         
## [44] "Duration_Before_Death_LOC_Occurred"            
## [45] "Stiff_Neck"                                    
## [46] "Bulging_Fontanelle"                            
## [47] "Skin_Rash"                                     
## [48] "Duration_of_Rash"                              
## [49] "Blisters_Present_in_Rash"                      
## [50] "Limbs_Become_Thin"                             
## [51] "Swollen_Legs_or_Feet"                          
## [52] "Duration_of_Swelling"                          
## [53] "Skin_Flake_Off_in_Patches"                     
## [54] "Hair_Color_Change_to_Red_Yellow"               
## [55] "Protruding_Belly"                              
## [56] "Pallor_or_Lack_of_Blood"                       
## [57] "Swelling_in_Armpits"                           
## [58] "Whitish_Rash_in_Mouth"                         
## [59] "Bleeding_Seen"                                 
## [60] "Skin_Turned_Black"                             
## [61] "Suffered_Road_Traffic_Injury"                  
## [62] "Suffered_a_Fall"                               
## [63] "Suffered_Drowning"                             
## [64] "Suffered_Poisoning"                            
## [65] "Suffered_Bite_Sting"                           
## [66] "Suffered_Burn_Fire"                            
## [67] "Victim_of_Violence"                            
## [68] "Other_Injury"                                  
## [69] "Unsure_if_Injury_Occurred"                     
## [70] "Refused_to_Answer_if_Deceased_Suffered_Injury" 
## [71] "Did_Not_Suffer_Injury"                         
## [72] "Injury_Intentionally_Inflicted_by_Someone"     
## [73] "Days_Survived_After_Injury"                    
## [74] "Sought_Care_While_Ill"                         
## [75] "Care_Sought_Traditional_Healer"                
## [76] "Care_Sought_Homeopath"                         
## [77] "Care_Sought_Religious_Leader"                  
## [78] "Care_Sought_Governmental_Hospital"             
## [79] "Care_Sought_Governmental_Health_Center_Clinic" 
## [80] "Care_Sought_Private_Hospital"                  
## [81] "Care_Sought_Community_Based_Practioner"        
## [82] "Care_Sought_Trained_Birth_Attendant"           
## [83] "Care_Sought_Private_Physician"                 
## [84] "Care_Sought_Pharmacy"                          
## [85] "Care_Sought_Other_Provider"                    
## [86] "Care_Sought_Relative_Friend"                   
## [87] "Health_Records_for_Deceased"                   
## [88] "Granted_Access_to_Health_Records"              
## [89] "Death_Certificate_Issued"                      
## [90] "Granted_Access_to_Death_Certificate"           
## [91] "Mother_Ever_Tested_for_HIV"                    
## [92] "Mother_HIV_Positive"                           
## [93] "Mother_AIDS_Positive"                          
## [94] "DOB"                                           
## [95] "DOD"                                           
## [96] "age_death"

Plot grouped box plots of the numerical variables for the top 5 causes of death (across the whole dataset)

# Box plots of every numeric column (from the third column onwards) of the
# top-5 subset: the first column is on the x axis and the second column
# sets the fill colour.
var_list <- colnames(training_set_copy_top5)
candidate_cols <- 3:length(var_list)
numeric_cols <- Filter(
  function(col_idx) is.numeric(training_set_copy_top5[, col_idx]),
  candidate_cols
)
grouped_box_list <- lapply(numeric_cols, function(col_idx) {
  ggplot(
    data = training_set_copy_top5,
    aes_string(x = var_list[1], y = var_list[col_idx], fill = var_list[2])
  ) +
    geom_boxplot() +
    theme(
      axis.text = element_text(size = 15, angle = 45, hjust = 1),
      axis.title = element_text(size = 15),
      legend.key.size = unit(2, "cm"),
      legend.title = element_text(size = 15),
      legend.text = element_text(size = 15)
    )
})
grouped_box_list
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

Plot grouped bar charts of the categorical variables for the top 5 causes of death across the whole dataset (only “Pneumonia”, “Diarrhea/Dysentery”, and “Sepsis” are plotted)

library(dplyr)
# The five causes of death that are most frequent in the whole dataset.
top_5 <- c(
  "Pneumonia",
  "Diarrhea/Dysentery",
  "Other Defined Causes of Child Deaths",
  "Sepsis",
  "Malaria"
)

Compute disease prevalence by site. These numbers will be used to determine whether a given cause is more prevalent in one group than in another across the different sites.

# Number of deaths per (site, cause of death).
site_cause_counts <- dplyr::summarise(
  group_by(training_set_copy[, c("site", "gs_text34")], site, gs_text34),
  count_cause = n()
)
# Prevalence: each count divided by the sum of counts in its group.
site_cause_prev <- dplyr::mutate(
  site_cause_counts,
  prevalence = count_cause / sum(count_cause)
)
# Most frequent causes first within every site.
cause_prev_by_site <- arrange(site_cause_prev, site, desc(count_cause))
cause_prev_by_site
## # A tibble: 84 x 4
## # Groups:   site [6]
##    site  gs_text34                            count_cause prevalence
##    <fct> <fct>                                      <int>      <dbl>
##  1 AP    Pneumonia                                     36     0.259 
##  2 AP    Other Defined Causes of Child Deaths          15     0.108 
##  3 AP    Diarrhea/Dysentery                            11     0.0791
##  4 AP    Sepsis                                        10     0.0719
##  5 AP    Drowning                                       9     0.0647
##  6 AP    Hemorrhagic fever                              9     0.0647
##  7 AP    Road Traffic                                   9     0.0647
##  8 AP    Fires                                          8     0.0576
##  9 AP    Other Cardiovascular Diseases                  6     0.0432
## 10 AP    Violent Death                                  5     0.0360
## # ... with 74 more rows
# Pneumonia
# For every categorical variable (all factor columns except `site` and
# `gs_text34`), plot the share of each level among the training-set deaths
# whose cause is Pneumonia, with one facet per site. Shares are computed
# within each site, so they sum to 1 per facet.
grouped_bar_Pneumonia <- list()
z <- 0
for (i in seq_along(names(training_set_copy))) {
  var_name <- names(training_set_copy)[i]
  # `&&` (not `&`): an `if` condition must be a single TRUE/FALSE.
  if (is.factor(training_set_copy[, i]) && var_name != "site" && var_name != "gs_text34") {
    z <- z + 1
    # Count rows per (site, level), then convert counts to per-site shares.
    # Columns are selected by name so the result does not depend on `site`
    # being the first column.
    grouped_class <- training_set_copy[training_set_copy$gs_text34 == "Pneumonia", c("site", var_name)] %>%
      group_by_all() %>%
      dplyr::summarise(count = n()) %>%
      group_by(site) %>%
      dplyr::mutate(perc = count / sum(count))
    grouped_bar <- ggplot(
      data = grouped_class,
      aes_string(x = var_name, y = "perc", fill = var_name)
    ) +
      geom_bar(position = 'dodge', stat = 'identity') +
      labs(x = var_name, y = "perc") +
      # ggtitle() takes the title text itself.
      ggtitle("Cause of death: Pneumonia") +
      theme(
        text = element_text(size = 12),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 12, hjust = 1, angle = 45)
      ) +
      # Label size is the fixed `size = 4`; no size aesthetic is mapped.
      geom_text(
        aes(label = scales::percent(perc), y = perc),
        position = position_dodge(width = 1),
        vjust = -0.5,
        size = 4
      ) +
      facet_grid(. ~ site, scales = 'free')
    grouped_bar_Pneumonia[[z]] <- grouped_bar
  }
}
grouped_bar_Pneumonia
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

## 
## [[65]]

## 
## [[66]]

## 
## [[67]]

## 
## [[68]]

## 
## [[69]]

## 
## [[70]]

## 
## [[71]]

## 
## [[72]]

## 
## [[73]]

## 
## [[74]]

## 
## [[75]]

# "Diarrhea/Dysentery"
# For every categorical variable (all factor columns except `site` and
# `gs_text34`), plot the share of each level among the training-set deaths
# whose cause is Diarrhea/Dysentery, with one facet per site. Shares are
# computed within each site, so they sum to 1 per facet.
# The plots are stored in and printed from grouped_bar_Diarrhea, so the
# Pneumonia plot list is left untouched.
grouped_bar_Diarrhea <- list()
z <- 0
for (i in seq_along(names(training_set_copy))) {
  var_name <- names(training_set_copy)[i]
  # `&&` (not `&`): an `if` condition must be a single TRUE/FALSE.
  if (is.factor(training_set_copy[, i]) && var_name != "site" && var_name != "gs_text34") {
    z <- z + 1
    # Count rows per (site, level), then convert counts to per-site shares.
    # Columns are selected by name so the result does not depend on `site`
    # being the first column.
    grouped_class <- training_set_copy[training_set_copy$gs_text34 == "Diarrhea/Dysentery", c("site", var_name)] %>%
      group_by_all() %>%
      dplyr::summarise(count = n()) %>%
      group_by(site) %>%
      dplyr::mutate(perc = count / sum(count))
    grouped_bar <- ggplot(
      data = grouped_class,
      aes_string(x = var_name, y = "perc", fill = var_name)
    ) +
      geom_bar(position = 'dodge', stat = 'identity') +
      labs(x = var_name, y = "perc") +
      # ggtitle() takes the title text itself.
      ggtitle("Cause of death: Diarrhea/Dysentery") +
      theme(
        text = element_text(size = 12),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 12, hjust = 1, angle = 45)
      ) +
      # Label size is the fixed `size = 4`; no size aesthetic is mapped.
      geom_text(
        aes(label = scales::percent(perc), y = perc),
        position = position_dodge(width = 1),
        vjust = -0.5,
        size = 4
      ) +
      facet_grid(. ~ site, scales = 'free')
    grouped_bar_Diarrhea[[z]] <- grouped_bar
  }
}
grouped_bar_Diarrhea
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

## 
## [[65]]

## 
## [[66]]

## 
## [[67]]

## 
## [[68]]

## 
## [[69]]

## 
## [[70]]

## 
## [[71]]

## 
## [[72]]

## 
## [[73]]

## 
## [[74]]

## 
## [[75]]

# Sepsis
# For every categorical variable (all factor columns except `site` and
# `gs_text34`), plot the share of each level among the training-set deaths
# whose cause is Sepsis, with one facet per site. Shares are computed within
# each site, so they sum to 1 per facet.
grouped_bar_sepsis <- list()
z <- 0
for (i in seq_along(names(training_set_copy))) {
  var_name <- names(training_set_copy)[i]
  # `&&` (not `&`): an `if` condition must be a single TRUE/FALSE.
  if (is.factor(training_set_copy[, i]) && var_name != "site" && var_name != "gs_text34") {
    z <- z + 1
    # Count rows per (site, level), then convert counts to per-site shares.
    # Columns are selected by name so the result does not depend on `site`
    # being the first column.
    grouped_class <- training_set_copy[training_set_copy$gs_text34 == "Sepsis", c("site", var_name)] %>%
      group_by_all() %>%
      dplyr::summarise(count = n()) %>%
      group_by(site) %>%
      dplyr::mutate(perc = count / sum(count))
    grouped_bar <- ggplot(
      data = grouped_class,
      aes_string(x = var_name, y = "perc", fill = var_name)
    ) +
      geom_bar(position = 'dodge', stat = 'identity') +
      labs(x = var_name, y = "perc") +
      # ggtitle() takes the title text itself.
      ggtitle("Cause of death: Sepsis") +
      theme(
        text = element_text(size = 12),
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 12, hjust = 1, angle = 45)
      ) +
      # Label size is the fixed `size = 4`; no size aesthetic is mapped.
      geom_text(
        aes(label = scales::percent(perc), y = perc),
        position = position_dodge(width = 1),
        vjust = -0.5,
        size = 4
      ) +
      facet_grid(. ~ site, scales = 'free')
    grouped_bar_sepsis[[z]] <- grouped_bar
  }
}
grouped_bar_sepsis
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

## 
## [[31]]

## 
## [[32]]

## 
## [[33]]

## 
## [[34]]

## 
## [[35]]

## 
## [[36]]

## 
## [[37]]

## 
## [[38]]

## 
## [[39]]

## 
## [[40]]

## 
## [[41]]

## 
## [[42]]

## 
## [[43]]

## 
## [[44]]

## 
## [[45]]

## 
## [[46]]

## 
## [[47]]

## 
## [[48]]

## 
## [[49]]

## 
## [[50]]

## 
## [[51]]

## 
## [[52]]

## 
## [[53]]

## 
## [[54]]

## 
## [[55]]

## 
## [[56]]

## 
## [[57]]

## 
## [[58]]

## 
## [[59]]

## 
## [[60]]

## 
## [[61]]

## 
## [[62]]

## 
## [[63]]

## 
## [[64]]

## 
## [[65]]

## 
## [[66]]

## 
## [[67]]

## 
## [[68]]

## 
## [[69]]

## 
## [[70]]

## 
## [[71]]

## 
## [[72]]

## 
## [[73]]

## 
## [[74]]

## 
## [[75]]

EDA Visualization of Gender of Deceased

# Bar chart of the number of training-set rows per Gender level,
# with bars coloured by Gender.
gender_mapping <- aes(x = Gender, fill = Gender)
gen <- ggplot(data = training_set_copy) +
  geom_bar(mapping = gender_mapping, stat = "Count")
print(gen)